Student Name: Zun Wang
Student ID: 915109847
Insert answers to R SNP exercises 1 - 4 here. Submit .Rmd and .html by git.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
vcf.data <- read_tsv("output/SNP_analysis/IMB211_R500.vcf",na = c("","NA","."),comment="#",col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_logical(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_double(),
## X7 = col_logical(),
## X8 = col_character(),
## X9 = col_character(),
## X10 = col_character(),
## X11 = col_character()
## )
vcf.header <- system("grep '#C' output/SNP_analysis/IMB211_R500.vcf",intern = TRUE) #might not work on Windows
vcf.header
## [1] "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tIMB211\tR500"
vcf.header <- vcf.header %>% str_replace("#","") #get rid of the pound sign
vcf.header <- vcf.header %>% str_split(pattern = "\t")
colnames(vcf.data) <- vcf.header[[1]] #we need the [[1]] because str_split returns a list and we want the first element
head(vcf.data)
vcf.data <- vcf.data %>%
filter(str_detect(INFO, "TYPE=snp"))
vcf.data <- vcf.data %>% separate(IMB211,
into = paste("IMB211",c("gt","tot.depth","allele.depth", "ref.depth","ref.qual","alt.depth","alt.qual","gt.lik"),sep="_"), # new column names
sep=":", #separate on ":"
convert=TRUE #converts numeric columns to numeric
)
vcf.data <- vcf.data %>% separate(R500,
into = paste("R500",c("gt","tot.depth","allele.depth","ref.depth","ref.qual","alt.depth","alt.qual","gt.lik"),sep="_"), # new column names
sep=":", #separate on ":"
convert=TRUE #converts numeric columns to numeric
)
Exercise 1
To explore the quality of our data, make a histogram of genotype quality. It is hard to get a feel for the distribution of QUAL scores at the low end of the scale (less than 500) using the default settings, so try making a second histogram that illustrates this region better. (Hint: one option is to subset the data)
pl <- ggplot(data=vcf.data, aes(x=QUAL))
pl <- pl + geom_histogram()
pl
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
vcf.data2 <- filter(vcf.data, QUAL <= 500)
pl2 <- ggplot(data = vcf.data2,aes(x=QUAL))
pl2 <- pl2 + geom_histogram()
pl2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Exercise 2
We only want to keep positions that have a reasonable probabilty of being correct.
a At a quality of 40 what is the probability that the SNP call is wrong?
# P = 10 ^ (-4) = 0.0001
b Subset the data to only keep positions where the quality score is 40 or greater. Put the retained SNPs in an object called vcf.data.good
vcf.data.good <- filter(vcf.data,QUAL >= 40)
c What percentage of SNPs were retained?
nrow(vcf.data.good) / nrow(vcf.data) * 100
## [1] 77.88913
vcf.data.good %>%
filter(IMB211_gt != R500_gt,
IMB211_tot.depth > 20,
R500_tot.depth > 20) %>%
select(CHROM, POS, REF, ALT, IMB211_gt, R500_gt)
Exercise 3: What do the “0/0”, “0/1”, and “1/1” values indicate? Use IGV to look at a few of the positions above (see lab page) and then explain what “0/0”, “0/1”, and “1/1” values indicate.
They are the genotype expression. 0/0 is homozygous reference, 1/1 is homozygous alternate, and 0/1 is heterozygous. Specifically, 0 means in this position, the base in the sample is same as reference; while 1 means the same as alternative.
table(vcf.data.good$IMB211_gt)
##
## 0/0 0/1 0/2 1/1 1/2 1/3 2/2
## 8587 1127 3 26973 33 1 110
table(vcf.data.good$R500_gt)
##
## 0/0 0/1 0/2 1/1 1/2 2/2 3/3
## 7045 1031 6 29202 40 91 1
vcf.data.good %>% select(IMB211_gt, R500_gt) %>% ftable
## R500_gt 0/0 0/1 0/2 1/1 1/2 2/2 3/3
## IMB211_gt
## 0/0 1 318 0 8263 5 0 0
## 0/1 305 418 1 299 2 1 0
## 0/2 0 1 0 1 1 0 0
## 1/1 6737 240 5 17220 19 89 0
## 1/2 2 5 0 11 10 1 1
## 1/3 0 0 0 0 1 0 0
## 2/2 0 2 0 107 1 0 0
Exercise 4
a (From the table generated in the lab), which SNPS would be most useful for a downstream QTL analysis of F2 progeny generated from a cross of IMB211 and R500? (Ignore the allele categories that have “2”, “3”, or “4”). Hint: you want SNPs that will unambiguously distinguish a locus as coming from IMB211 or R500.
The cross between 1/1 and 0/0, while the order of 1/1 and 0/0 assigned to the group does not matter.
b Subset the vcf.data.good data frame so that you only have these SNPs. Place the results in vcf.data.good.F2
vcf.data.good.F2 <- filter(vcf.data.good, (IMB211_gt == "1/1" & R500_gt == "0/0")|(IMB211_gt == "0/0" & R500_gt == "1/1") )
vcf.data.good.F2
c How many SNPS are retained?
15000
Exercise 5
a Using the high quality F2 SNP list from Exercise 4 (vcf.data.good.F2), for each SNP plot its position on the chromosome (x axis), and total read depth (R500 and IMB211 combined) (y axis).
vcf.data.good.F2["totdepth"] <- vcf.data.good.F2$R500_tot.depth+vcf.data.good.F2$IMB211_tot.depth
pl3 <- ggplot(vcf.data.good.F2,aes(x = POS,y = totdepth))+geom_point()
pl3
Optional: color each SNP based on the percentage of reads that are R500. (optional part not graded).
vcf.data.good.F2["R500perc"] <- vcf.data.good.F2$R500_tot.depth/vcf.data.good.F2$totdepth
pl4 <- ggplot(vcf.data.good.F2,aes(x = POS,y = totdepth,color = R500perc))+geom_point()
pl4
b Use the help function to learn about xlim(). Use this function to plot only the region betweeen 20,000,000 and 25,000,000 bp. Why might there be gaps with no SNPs?
pl5 <- pl4 + xlim(20000000,25000000)
pl5
## Warning: Removed 12009 rows containing missing values (geom_point).
For Fun (??)–not graded–
Plot the number of each type of base change (A->G, etc). Are there differences? Is this expected?
baseChange <- data.frame(matrix(nrow = 12,ncol = 0))
baseChange["base_change"] <- c("AtoT","TtoA","AtoC","CtoA","AtoG","GtoA","TtoC","CtoT","TtoG","GtoT","CtoG","GtoC")
baseChange.ac <- select(vcf.data.good.F2,REF,ALT)
baseChange.ac$REF <- as.character(baseChange.ac$REF)
baseChange.ac$ALT <- as.character(baseChange.ac$ALT)
df_a <- baseChange.ac %>%
mutate(REF_new=strsplit(REF, "")) %>%
unnest(REF_new)
baseChange.ac<- baseChange.ac %>%
mutate(ALT=strsplit(ALT, "")) %>%
unnest(ALT)
baseChange.ac$REF <- df_a$REF_new
baseChange.ac["ALTnum"] <- 1
baseChange.ac["REFnum"] <- 1
baseChange.ac$REFnum[which(baseChange.ac$REF=="T")] <- 2
baseChange.ac$REFnum[which(baseChange.ac$REF=="C")] <- 4
baseChange.ac$REFnum[which(baseChange.ac$REF=="G")] <- 8
baseChange.ac$ALTnum[which(baseChange.ac$ALT=="T")] <- 2
baseChange.ac$ALTnum[which(baseChange.ac$ALT=="C")] <- 4
baseChange.ac$ALTnum[which(baseChange.ac$ALT=="G")] <- 8
baseChange.ac["diff"]<-baseChange.ac$REFnum - baseChange.ac$ALTnum
baseChange.ac
baseChange["count"]<-c(nrow(subset(baseChange.ac,diff == -1)),nrow(subset(baseChange.ac,diff == 1)),nrow(subset(baseChange.ac,diff == -3)),nrow(subset(baseChange.ac,diff == 3)),nrow(subset(baseChange.ac,diff == -7)),nrow(subset(baseChange.ac,diff == 7)),nrow(subset(baseChange.ac,diff == -2)),nrow(subset(baseChange.ac,diff == 2)),nrow(subset(baseChange.ac,diff == -6)),nrow(subset(baseChange.ac,diff == 6)),nrow(subset(baseChange.ac,diff == -4)),nrow(subset(baseChange.ac,diff == 4)))
baseChange
pl6 <- ggplot(baseChange,aes(x = base_change,y = count,fill = base_change))+geom_bar(stat = "identity")
pl6